The dataset used in this project is part of the Yelp Dataset Challenge 2018 (Round 12). The dataset contains a set of JSON files that include business information, reviews, tips (shorter reviews), user information and check-ins.
The data consists of six sub-datasets, each of which is briefly described below.
The Yelp dataset includes the following sub files:
As we are working with a relatively large dataset, which contains information related to different business types, this project is focused on establishing a business idea or use case for the exploitation of this dataset.
In order to identify this use case, the first step will include the analysis of the dataset to have a first idea of the data with which we are dealing and thus be able to establish the use case.
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
from collections import Counter
import plotly.express as px
# Raise pandas' display limits (up to 500 columns and 5000 rows) so that wide
# or long DataFrames are shown in full instead of being truncated.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 5000)
def get_nulls(df):
    """Summarise missing values and dtypes for every column of ``df``.

    Returns a DataFrame with one row per column of ``df`` and three columns:
    ``index`` (the column name), ``missing_value`` (number of null entries)
    and ``feature_type`` (the column dtype). Rows are ordered by
    ``missing_value`` and then by ``feature_type``.
    """
    # Null count per column, as a frame keyed by the column name ('index').
    null_counts = (
        df.isnull()
        .sum(axis=0)
        .sort_values()
        .to_frame('missing_value')
        .reset_index()
    )
    # Dtype per column, keyed the same way so both frames can be joined.
    column_types = df.dtypes.to_frame('feature_type').reset_index()

    summary = pd.merge(null_counts, column_types, on='index', how='inner')
    return summary.sort_values(['missing_value', 'feature_type'])
def get_outliers(df):
    """Count, per column, the values lying outside the 1.5 * IQR fences.

    A value is counted when it is below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``, where Q1/Q3 are the 25th/75th percentiles of its column.

    Quantiles are only defined for numeric data, so they are computed on the
    numeric columns explicitly instead of relying on ``DataFrame.quantile``
    to skip the others (recent pandas versions raise on object columns).
    Non-numeric columns are reported with a count of 0.

    Returns an integer Series indexed by the columns of ``df``.
    """
    numeric = df.select_dtypes(include='number')
    if numeric.shape[1] == 0:
        # Nothing to measure: every column is non-numeric (or df has no columns).
        return pd.Series(0, index=df.columns, dtype='int64')

    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)
    iqr = q3 - q1

    below_lower_fence = numeric < (q1 - 1.5 * iqr)
    above_upper_fence = numeric > (q3 + 1.5 * iqr)
    counts = (below_lower_fence | above_upper_fence).sum()

    # Re-introduce the non-numeric columns so callers still get one entry per column.
    return counts.reindex(df.columns, fill_value=0).astype('int64')
def get_businesses_types(df):
    """Return the 20 most frequent business categories.

    ``df['categories']`` is expected to hold ', '-separated category strings.
    Each business contributes one count to every category it lists.

    Returns a DataFrame with columns ``Categories`` and ``Count``, most
    frequent category first.
    """
    # Turn "A, B, C" into one row per category.
    split_categories = df['categories'].str.split(', ')
    per_category = df.assign(categories=split_categories).explode('categories')

    counts = per_category['categories'].value_counts().to_frame().reset_index()
    counts.columns = ['Categories', 'Count']
    return counts.head(20)
def get_top_10_businesses(df):
    """Show a bar chart of the ten most frequent business categories.

    ``df['categories']`` is expected to hold ', '-separated category strings.
    The chart is displayed with ``plt.show()``; nothing is returned.
    """
    # One row per (business, category) pair, then count each category.
    per_category = df.assign(categories=df['categories'].str.split(', ')).explode('categories')
    counts = per_category['categories'].value_counts().to_frame().reset_index()
    counts.columns = ['Categories', 'Count']
    top_ten = counts.sort_values(by='Count', ascending=False).head(10)

    labels = top_ten['Categories'].tolist()
    heights = top_ten['Count'].tolist()
    positions = np.arange(len(labels))

    # Wide figure so the ten category labels fit side by side on the x axis.
    figure = plt.figure(figsize=(20, 5))
    figure.add_subplot(111)
    plt.bar(positions, heights, align='center', alpha=1)
    plt.xticks(positions, labels)
    plt.ylabel('Number of Occurences')
    plt.title('Top 10 Businesses in Yelp dataset')
    plt.show()
def get_ratings_numbers(df):
    """Count how many rows of ``df`` have each star rating.

    Returns a DataFrame with columns ``Stars`` and ``Count``, most frequent
    rating first.

    The counts are taken directly from ``df['stars']``. The previous version
    passed the column through ``Series.explode('stars')``; the positional
    argument is ``ignore_index``, so the index was reset and the values were
    misaligned (and partly lost as NaN) whenever ``df`` did not have a default
    0..n-1 index, e.g. after rows had been dropped.
    """
    stars = df['stars'].value_counts().to_frame().reset_index()
    stars.columns = ['Stars', 'Count']
    return stars
def get_ratings_distribution(df):
    """Draw a histogram of ``df['stars']`` on the current matplotlib figure.

    The bin edges run from 1.0 to 5.5 in steps of 0.5. The function does not
    call ``plt.show()`` and returns nothing.
    """
    half_star_edges = np.linspace(1., 5.5, 10)
    plt.hist(df['stars'], bins=half_star_edges)
    plt.title('Businesses by Stars')
    plt.xlabel('Stars')
    plt.ylabel('# of Businesses')
def get_categories_names(df):
    """Rename the columns of ``df`` in place to 'Category 1', 'Category 2', ...

    Columns are renamed by position in a single assignment. Renaming them one
    label at a time goes wrong when an existing label already equals one of
    the target names (or when labels are duplicated), because a later rename
    then hits more than one column.

    Mutates ``df`` and returns ``None``.
    """
    df.columns = [f'Category {position}' for position in range(1, len(df.columns) + 1)]
def get_nighlife_businesses(df):
    """Return the rows of ``df`` that list 'Nightlife' among their categories.

    ``df['categories']`` is expected to hold ', '-separated category strings.
    The column is split and exploded to one category per row, and only the
    rows whose category is exactly 'Nightlife' are kept, so in the result the
    ``categories`` column contains 'Nightlife' and the other columns are
    unchanged.

    The function works on the ``df`` argument; it previously ignored it and
    read the module-level ``business_df_preprocessed`` instead.
    """
    exploded = df.assign(categories=df['categories'].str.split(', ')).explode('categories')
    return exploded[exploded['categories'] == 'Nightlife']
def get_state_businesses(df, state='PA'):
    """Return the rows of ``df`` whose ``state`` column equals ``state``.

    Parameters:
        df: DataFrame with a ``state`` column.
        state: state code to keep. Defaults to 'PA', the value that was
            previously hard-coded, so existing calls behave as before.
    """
    return df[df['state'] == state]
def get_PA_geovisualization(df):
    """Plot county outlines, label every county and overlay point markers.

    NOTE(review): the ``df`` argument is not used. The function reads the
    module-level names ``county_small`` and ``gdf``, which are not defined in
    this part of the file -- presumably the county shapes loaded from the PA
    shapefile and a geo-frame of business locations; confirm against their
    definitions.
    """
    base = county_small.plot(color='white', edgecolor='black', figsize=(40, 40))

    # Write each county's name at the centroid of its geometry. The label is
    # passed positionally: the keyword for it was renamed from ``s`` to
    # ``text`` in matplotlib, so a positional argument works on both.
    for _, county in county_small.iterrows():
        base.annotate(
            county.COUNTY_NAM,
            xy=county.geometry.centroid.coords[0],
            ha='center',
            fontsize=25,
        )

    gdf.plot(ax=base, marker='o', color='lightblue', markersize=5)
def _plot_businesses_by_state(df, title):
    """Show a USA choropleth coloured by the number of rows of ``df`` per state.

    Parameters:
        df: DataFrame with a ``state`` column; each row counts as one business.
        title: title of the figure.
    """
    businesses_per_state = df.groupby('state').size().reset_index(name='num_businesses')
    fig = px.choropleth(
        businesses_per_state,
        locations='state',
        locationmode="USA-states",
        scope="usa",
        color='num_businesses',
        color_continuous_scale=px.colors.sequential.Plasma,
        hover_name='num_businesses',
        title=title,
    )
    fig.update_layout(geo=dict(showlakes=False))
    fig.show()


def get_business_visualization(df):
    """Show the per-state choropleth of all businesses in ``df``."""
    _plot_businesses_by_state(df, 'Total Opened Businesses by state')


def get_nightlife_business_visualization(df):
    """Show the per-state choropleth for a frame of nightlife businesses.

    Identical to ``get_business_visualization`` except for the figure title;
    filtering ``df`` down to nightlife businesses is left to the caller.
    """
    _plot_businesses_by_state(df, 'Total Nightlife Businesses by state')
# Input/output locations, given relative to the working directory.
business_json_path = '../data/Raw/yelp_academic_dataset_business.json'
# Not used in this part of the file -- presumably the export target for the
# filtered PA nightlife businesses and the county shapefile for the PA map.
nightlife_business_csv_outpath = '../data/Processed/nightlife_business_PA.csv'
PA_shp = '../data/Raw/shp/PaCounty2022_11.shp'
# NOTE(review): this silences every warning from here on, including pandas
# deprecation warnings that may point at real problems.
warnings.filterwarnings('ignore')
# lines=True: the file is read as one JSON object per line.
business_df = pd.read_json(business_json_path ,lines= True)
business_df.head()
| business_id | name | address | city | state | postal_code | latitude | longitude | stars | review_count | is_open | attributes | categories | hours | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Pns2l4eNsfO8kk83dixA6A | Abby Rappoport, LAC, CMQ | 1616 Chapala St, Ste 2 | Santa Barbara | CA | 93101 | 34.426679 | -119.711197 | 5.0 | 7 | 0 | {'ByAppointmentOnly': 'True'} | Doctors, Traditional Chinese Medicine, Naturop... | None |
| 1 | mpf3x-BjTdTEA3yCZrAYPw | The UPS Store | 87 Grasso Plaza Shopping Center | Affton | MO | 63123 | 38.551126 | -90.335695 | 3.0 | 15 | 1 | {'BusinessAcceptsCreditCards': 'True'} | Shipping Centers, Local Services, Notaries, Ma... | {'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ... |
| 2 | tUFrWirKiKi_TAnsVWINQQ | Target | 5255 E Broadway Blvd | Tucson | AZ | 85711 | 32.223236 | -110.880452 | 3.5 | 22 | 0 | {'BikeParking': 'True', 'BusinessAcceptsCredit... | Department Stores, Shopping, Fashion, Home & G... | {'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ... |
| 3 | MTSW4McQd7CbVtyjqoe9mw | St Honore Pastries | 935 Race St | Philadelphia | PA | 19107 | 39.955505 | -75.155564 | 4.0 | 80 | 1 | {'RestaurantsDelivery': 'False', 'OutdoorSeati... | Restaurants, Food, Bubble Tea, Coffee & Tea, B... | {'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ... |
| 4 | mWMc6_wTdE0EUBKIGXDVfA | Perkiomen Valley Brewery | 101 Walnut St | Green Lane | PA | 18054 | 40.338183 | -75.471659 | 4.5 | 13 | 1 | {'BusinessAcceptsCreditCards': 'True', 'Wheelc... | Brewpubs, Breweries, Food | {'Wednesday': '14:0-22:0', 'Thursday': '16:0-2... |
As we can see, we are dealing with a dataset (business yelp dataset) that includes different types of businesses. Each row would correspond to a business.
print("In total we have", business_df.shape[0], "businesses and ", business_df.shape[1], ' variables')
In total we have 150346 businesses and 14 variables
This information is indicative to see what type of dataset we are dealing with. I mention this, since we find variables grouped in the attributes variables for example. Therefore, the number of variables can vary and increase later.
business_df.business_id.describe()
count 150346 unique 150346 top Pns2l4eNsfO8kk83dixA6A freq 1 Name: business_id, dtype: object
As we can see, the business dataset we are working with does not contain any duplicates. We have the same number of ids and unique entries. Thus there are no duplicate entries since there is no recurring business ids
get_nulls(business_df)
| index | missing_value | feature_type | |
|---|---|---|---|
| 9 | review_count | 0 | int64 |
| 10 | is_open | 0 | int64 |
| 6 | latitude | 0 | float64 |
| 7 | longitude | 0 | float64 |
| 8 | stars | 0 | float64 |
| 0 | business_id | 0 | object |
| 1 | name | 0 | object |
| 2 | address | 0 | object |
| 3 | city | 0 | object |
| 4 | state | 0 | object |
| 5 | postal_code | 0 | object |
| 11 | categories | 103 | object |
| 12 | attributes | 13744 | object |
| 13 | hours | 23223 | object |
We have 103 null values in the categories variable, 23223 nulls in the hours variable and 13744 in the attributes variable. We will see later how we treat these null values after selecting the use case for this dataset.
outliers_numbers = get_outliers(business_df)
outliers_numbers
address 0 attributes 0 business_id 0 categories 0 city 0 hours 0 is_open 30648 latitude 5573 longitude 22953 name 0 postal_code 0 review_count 18302 stars 0 state 0 dtype: int64
We can see that there are outliers in 4 specific variables: longitude, latitude, review_count and is_open. For the outliers corresponding to longitude and latitude, I consider that it is not necessary to make any adjustments, since they may simply be businesses located far away from the rest (in Alaska, for example) that are flagged as outliers for that reason. In any case, we will carry out a geospatial visualization to better observe this data. Regarding review_count, we will also analyze it, since these can be businesses with fake reviews, businesses with many reviews because they are good, or new businesses that have no reviews yet. The is_open variable also reports outliers: their number (30648) equals the number of closed businesses (is_open == 0), so they simply correspond to the closed businesses.
As with null values, once the use case is identified, we will deal with outliers.
business_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 150346 entries, 0 to 150345 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 business_id 150346 non-null object 1 name 150346 non-null object 2 address 150346 non-null object 3 city 150346 non-null object 4 state 150346 non-null object 5 postal_code 150346 non-null object 6 latitude 150346 non-null float64 7 longitude 150346 non-null float64 8 stars 150346 non-null float64 9 review_count 150346 non-null int64 10 is_open 150346 non-null int64 11 attributes 136602 non-null object 12 categories 150243 non-null object 13 hours 127123 non-null object dtypes: float64(3), int64(2), object(9) memory usage: 16.1+ MB
business_df.dtypes.sort_values().to_frame('feature_type').groupby(by = 'feature_type').size().to_frame('count').reset_index()
| feature_type | count | |
|---|---|---|
| 0 | int64 | 2 |
| 1 | float64 | 3 |
| 2 | object | 9 |
We have 2 int variables, 3 float variables and 9 object variables. However, as mentioned earlier this will possibly change after.
It is very relevant to show the distribution of this variable (is_open), because we will probably be working only with open businesses.
business_df.groupby('is_open').business_id.count()
is_open 0 30648 1 119698 Name: business_id, dtype: int64
As we can see, there are around 30k businesses closed. We only need to focus on the businesses which are open ( i.e. business.is_open ==1)
# Remove the closed businesses (is_open == 0) from business_df in place. The
# index is not reset, so the remaining row labels are no longer contiguous.
business_df.drop(business_df[business_df.is_open == 0].index, inplace=True)
# Re-count per is_open value to confirm that only open businesses remain.
business_df.groupby('is_open').business_id.count()
is_open 1 119698 Name: business_id, dtype: int64
get_businesses_types(business_df)
| Categories | Count | |
|---|---|---|
| 0 | Restaurants | 34987 |
| 1 | Food | 20419 |
| 2 | Shopping | 20186 |
| 3 | Home Services | 13322 |
| 4 | Beauty & Spas | 12263 |
| 5 | Health & Medical | 11046 |
| 6 | Local Services | 10138 |
| 7 | Automotive | 9879 |
| 8 | Nightlife | 8379 |
| 9 | Event Planning & Services | 8173 |
| 10 | Bars | 7528 |
| 11 | Active Life | 6496 |
| 12 | Sandwiches | 6075 |
| 13 | American (Traditional) | 5531 |
| 14 | Fast Food | 5516 |
| 15 | Hotels & Travel | 5123 |
| 16 | Pizza | 5090 |
| 17 | Home & Garden | 5022 |
| 18 | Auto Repair | 5003 |
| 19 | Coffee & Tea | 4954 |
get_top_10_businesses(business_df)
There are 1310 types of businesses, restaurants being the most important type of business. Following restaurants, we have food and shopping. I consider important to highlight that restaurants and food are related, therefore we must keep these two categories in order to carry out the model
get_ratings_numbers(business_df)
| Stars | Count | |
|---|---|---|
| 0 | 4.0 | 19267 |
| 1 | 4.5 | 17962 |
| 2 | 3.5 | 15776 |
| 3 | 5.0 | 11571 |
| 4 | 3.0 | 10927 |
| 5 | 2.5 | 8949 |
| 6 | 2.0 | 6197 |
| 7 | 1.5 | 3314 |
| 8 | 1.0 | 1334 |
get_ratings_distribution(business_df)
Most of the businesses have a 4-star rating. According to the table above, 5 stars is the fourth most frequent rating and 1 star the least frequent.
pd.DataFrame(Counter(business_df.city.values).most_common(10), columns=['city','businesses'])
| city | businesses | |
|---|---|---|
| 0 | Philadelphia | 10542 |
| 1 | Tucson | 7533 |
| 2 | Tampa | 7219 |
| 3 | Indianapolis | 5894 |
| 4 | Nashville | 5398 |
| 5 | Reno | 4762 |
| 6 | New Orleans | 4649 |
| 7 | Edmonton | 3916 |
| 8 | Saint Louis | 3403 |
| 9 | Santa Barbara | 3020 |
From this, we see the distribution of businesses by city. Philadelphia is the city with the most open businesses, followed by Tucson. In order to solve the established problem, we will have to focus on a specific city.
# Per city: total number of reviews (sum of review_count) and mean star rating
# of its businesses, ordered by total number of reviews, largest first.
review_count = business_df[['city', 'review_count', 'stars']].groupby(['city']).agg({'review_count': 'sum', 'stars': 'mean'}).sort_values(by='review_count', ascending=False)
review_count.head(10)
| review_count | stars | |
|---|---|---|
| city | ||
| Philadelphia | 720331 | 3.647031 |
| New Orleans | 525055 | 3.851904 |
| Nashville | 368792 | 3.649315 |
| Tampa | 367198 | 3.597590 |
| Tucson | 326434 | 3.621532 |
| Indianapolis | 281539 | 3.591958 |
| Reno | 280921 | 3.794519 |
| Santa Barbara | 218360 | 4.126490 |
| Saint Louis | 189410 | 3.622686 |
| Boise | 90250 | 3.740349 |
From this, we see the distribution of reviews and stars by city. As we can see, Philadelphia, being the top city by business count, is also the city with the most reviews, with a mean of about 3.6 stars for its businesses.
pd.DataFrame(Counter(business_df.state.values).most_common(), columns=['state','businesses'])
| state | businesses | |
|---|---|---|
| 0 | PA | 26289 |
| 1 | FL | 21540 |
| 2 | TN | 9600 |
| 3 | IN | 8946 |
| 4 | MO | 8363 |
| 5 | AZ | 8108 |
| 6 | LA | 7676 |
| 7 | NJ | 7031 |
| 8 | NV | 6277 |
| 9 | AB | 4346 |
| 10 | CA | 4065 |
| 11 | ID | 3783 |
| 12 | DE | 1894 |
| 13 | IL | 1765 |
| 14 | TX | 4 |
| 15 | WA | 2 |
| 16 | MA | 2 |
| 17 | HI | 1 |
| 18 | UT | 1 |
| 19 | CO | 1 |
| 20 | MI | 1 |
| 21 | SD | 1 |
| 22 | XMS | 1 |
| 23 | VT | 1 |
Pennsylvania is the state with the most open businesses.
get_business_visualization(business_df)
We have decided to include this geospatial visualization showing the distribution of businesses among the different states. Overall, geospatial visualization can be a powerful tool for exploring and understanding geographical data, and it is an important aspect of data analysis and visualization in many fields.
In this case, the Yelp dataset does not include businesses for all the states of the United States. Thus, the missing states are shown in grey. The map is interactive; therefore, we can see the number of businesses for each state simply by hovering the mouse over a state.
Taking into account the previous exploratory analysis of the data that we have carried out, we propose the following problem and idea of use for the dataset with which we are working.
Social media customer reviews have a significant impact on any business's prospects of success. When choosing a business among different options, customers look for a comprehensive and pleasant experience in terms of service quality, ambiance, other users' experiences, and so on. They also frequently ask other customers for recommendations. Users of Yelp can easily access this data.
In our case, we aim to focus on the nightlife sector among the many different types of businesses gathered in the yelp dataset.
In addition, a service like Yelp gathers thousands of reviews each day. Summarizing or extracting specific pieces of information from such a big corpus is a challenging task.
Data mining and more concretely text mining techniques allow us to explore a massive corpus like the one of Yelp reviews. We can obtain new insights about the text content that may be helpful for customers, nightlife businesses owners, government or even for Yelp.
The Yelp dataset therefore contains a lot of review data: text, rating and stars. This project aims to train a model that, given a set of keywords extracted from the users' reviews, predicts the rating of a review. To do so, we will apply NLP and a form of sentiment analysis when building the model.
In this project, we have focused on the following two datasets:
Business dataset containing business objects, list name, location, opening hours, category, average star rating, the number of reviews about the business and a series of attributes like noise level or reservations policy.
Reviews dataset containing reviews objects list, a star rating, the review text, the review date, and the number of votes that the review has received.
To do so, we have begun with the relevant exploratory data analysis, which has allowed us to discover the type of data with which we have to work. This has allowed us to get to know the data and to establish a use case or business idea for the exploitation of the data.
Later we will carry out the preprocessing of the data that will allow us to adjust the data to be able to work with them for the creation of the recommendation system.
After conducting the exploratory analysis, we trimmed the Yelp dataset to the state of Pennsylvania only, including the nightlife-related categories, since this is where we find the largest number of businesses within this category.
We selected instances with:
As we are trimming the dataset by business category (we have selected Nightlife, which appears at index 8 in the category ranking above) and by state, I have concluded that choosing Pennsylvania is the best option, since it is the state with the most nightlife businesses. (As choosing the nightlife sector already trims the dataset considerably, we decided to choose the first state by number of businesses so that we have a sufficient number of reviews to work with.) Therefore, we would be working with a total of 252555 reviews.
Establishing this use case will allow us to reduce the dataset with which we work and thus be able to take better advantage of computing power.
Following the analysis of the dataset corresponding to the business, we have decided to carry out an initial preprocessing of the dataset to adjust the dataset based on the requirements we need.
For this, and as we were mentioning before, it is necessary to adjust 3 categories found in nested form.
To do so, we will create a new variable for each column that is stored in nested form. The function pd.Series allows us to convert, in this case, each nested dictionary inside the column into a Series (one column per key). Then we will concatenate the resulting variables to the business dataframe.
# Expand the nested 'hours' value of every business into one column per key
# (the weekdays, per the output below); keys a business lacks become NaN.
hours = business_df['hours'].apply(pd.Series)
hours
| Monday | Tuesday | Wednesday | Thursday | Friday | Saturday | Sunday | |
|---|---|---|---|---|---|---|---|
| 1 | 0:0-0:0 | 8:0-18:30 | 8:0-18:30 | 8:0-18:30 | 8:0-18:30 | 8:0-14:0 | NaN |
| 3 | 7:0-20:0 | 7:0-20:0 | 7:0-20:0 | 7:0-20:0 | 7:0-21:0 | 7:0-21:0 | 7:0-21:0 |
| 4 | NaN | NaN | 14:0-22:0 | 16:0-22:0 | 12:0-22:0 | 12:0-22:0 | 12:0-18:0 |
| 5 | 0:0-0:0 | 6:0-22:0 | 6:0-22:0 | 6:0-22:0 | 9:0-0:0 | 9:0-22:0 | 8:0-22:0 |
| 6 | 0:0-0:0 | 10:0-18:0 | 10:0-18:0 | 10:0-18:0 | 10:0-18:0 | 10:0-18:0 | 12:0-18:0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 150341 | 10:0-19:30 | 10:0-19:30 | 10:0-19:30 | 10:0-19:30 | 10:0-19:30 | 10:0-17:30 | 11:0-17:0 |
| 150342 | 9:30-17:30 | 9:30-17:30 | 9:30-17:30 | 9:30-17:30 | 9:30-17:30 | 9:0-17:0 | 0:0-16:0 |
| 150343 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 150344 | 9:0-20:0 | 9:0-20:0 | 9:0-20:0 | 9:0-20:0 | 9:0-20:0 | 9:0-17:0 | 10:0-17:0 |
| 150345 | NaN | 12:0-19:0 | 12:0-19:0 | 12:0-19:0 | 12:0-19:0 | 12:0-19:0 | NaN |
119698 rows × 7 columns
# Expand the nested 'attributes' value of every business into one column per
# attribute name; attributes a business does not declare become NaN.
attributes = business_df['attributes'].apply(pd.Series)
attributes
| BusinessAcceptsCreditCards | RestaurantsDelivery | OutdoorSeating | BusinessParking | BikeParking | RestaurantsPriceRange2 | RestaurantsTakeOut | ByAppointmentOnly | WiFi | Alcohol | Caters | WheelchairAccessible | GoodForKids | RestaurantsAttire | RestaurantsReservations | Ambience | CoatCheck | DogsAllowed | RestaurantsTableService | RestaurantsGoodForGroups | HasTV | HappyHour | DriveThru | GoodForMeal | NoiseLevel | BusinessAcceptsBitcoin | AcceptsInsurance | Smoking | Music | GoodForDancing | BestNights | BYOB | Corkage | BYOBCorkage | HairSpecializesIn | Open24Hours | RestaurantsCounterService | AgesAllowed | DietaryRestrictions | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | False | False | False | {'garage': False, 'street': True, 'validated':... | True | 1 | True | False | u'free' | u'none' | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | True | NaN | NaN | {'garage': None, 'street': None, 'validated': ... | True | NaN | True | NaN | NaN | NaN | False | True | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5 | True | True | True | None | False | 1 | True | False | u'no' | u'none' | False | True | True | u'casual' | False | None | False | False | False | True | True | False | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 6 | True | NaN | NaN | {'garage': False, 'street': False, 'validated'... | True | 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 150341 | NaN | NaN | NaN | NaN | NaN | 3 | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 150342 | True | NaN | NaN | {'garage': False, 'street': False, 'validated'... | True | 2 | NaN | NaN | u'no' | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 150343 | True | NaN | NaN | NaN | NaN | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 150344 | True | None | NaN | {'garage': False, 'street': False, 'validated'... | True | 4 | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 150345 | True | NaN | NaN | {'garage': False, 'street': False, 'validated'... | False | 1 | NaN | False | u'free' | NaN | NaN | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
119698 rows × 39 columns
# Drop the 'BusinessParking' column in place; its values are themselves nested
# and it is not needed for this study.
attributes.drop('BusinessParking', axis = 1, inplace = True)
attributes
| BusinessAcceptsCreditCards | RestaurantsDelivery | OutdoorSeating | BikeParking | RestaurantsPriceRange2 | RestaurantsTakeOut | ByAppointmentOnly | WiFi | Alcohol | Caters | WheelchairAccessible | GoodForKids | RestaurantsAttire | RestaurantsReservations | Ambience | CoatCheck | DogsAllowed | RestaurantsTableService | RestaurantsGoodForGroups | HasTV | HappyHour | DriveThru | GoodForMeal | NoiseLevel | BusinessAcceptsBitcoin | AcceptsInsurance | Smoking | Music | GoodForDancing | BestNights | BYOB | Corkage | BYOBCorkage | HairSpecializesIn | Open24Hours | RestaurantsCounterService | AgesAllowed | DietaryRestrictions | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | False | False | False | True | 1 | True | False | u'free' | u'none' | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | True | NaN | NaN | True | NaN | True | NaN | NaN | NaN | False | True | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5 | True | True | True | False | 1 | True | False | u'no' | u'none' | False | True | True | u'casual' | False | None | False | False | False | True | True | False | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 6 | True | NaN | NaN | True | 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 150341 | NaN | NaN | NaN | NaN | 3 | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 150342 | True | NaN | NaN | True | 2 | NaN | NaN | u'no' | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 150343 | True | NaN | NaN | NaN | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 150344 | True | None | NaN | True | 4 | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 150345 | True | NaN | NaN | False | 1 | NaN | False | u'free' | NaN | NaN | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
119698 rows × 38 columns
We delete business parking variable, which is also stored in nested form, as we consider that this variable is not relevant for the study that we propose.
We concatenate both variables, attributes and hours, to have a more accessible variable.
# Place the flattened attribute and opening-hours columns side by side
# (axis=1 concatenates column-wise, aligning rows on the index).
attributes_hours_concat = pd.concat([attributes, hours], axis = 1)
attributes_hours_concat
| BusinessAcceptsCreditCards | RestaurantsDelivery | OutdoorSeating | BikeParking | RestaurantsPriceRange2 | RestaurantsTakeOut | ByAppointmentOnly | WiFi | Alcohol | Caters | WheelchairAccessible | GoodForKids | RestaurantsAttire | RestaurantsReservations | Ambience | CoatCheck | DogsAllowed | RestaurantsTableService | RestaurantsGoodForGroups | HasTV | HappyHour | DriveThru | GoodForMeal | NoiseLevel | BusinessAcceptsBitcoin | AcceptsInsurance | Smoking | Music | GoodForDancing | BestNights | BYOB | Corkage | BYOBCorkage | HairSpecializesIn | Open24Hours | RestaurantsCounterService | AgesAllowed | DietaryRestrictions | Monday | Tuesday | Wednesday | Thursday | Friday | Saturday | Sunday | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0:0-0:0 | 8:0-18:30 | 8:0-18:30 | 8:0-18:30 | 8:0-18:30 | 8:0-14:0 | NaN |
| 3 | False | False | False | True | 1 | True | False | u'free' | u'none' | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 7:0-20:0 | 7:0-20:0 | 7:0-20:0 | 7:0-20:0 | 7:0-21:0 | 7:0-21:0 | 7:0-21:0 |
| 4 | True | NaN | NaN | True | NaN | True | NaN | NaN | NaN | False | True | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 14:0-22:0 | 16:0-22:0 | 12:0-22:0 | 12:0-22:0 | 12:0-18:0 |
| 5 | True | True | True | False | 1 | True | False | u'no' | u'none' | False | True | True | u'casual' | False | None | False | False | False | True | True | False | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0:0-0:0 | 6:0-22:0 | 6:0-22:0 | 6:0-22:0 | 9:0-0:0 | 9:0-22:0 | 8:0-22:0 |
| 6 | True | NaN | NaN | True | 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0:0-0:0 | 10:0-18:0 | 10:0-18:0 | 10:0-18:0 | 10:0-18:0 | 10:0-18:0 | 12:0-18:0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 150341 | NaN | NaN | NaN | NaN | 3 | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 10:0-19:30 | 10:0-19:30 | 10:0-19:30 | 10:0-19:30 | 10:0-19:30 | 10:0-17:30 | 11:0-17:0 |
| 150342 | True | NaN | NaN | True | 2 | NaN | NaN | u'no' | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 9:30-17:30 | 9:30-17:30 | 9:30-17:30 | 9:30-17:30 | 9:30-17:30 | 9:0-17:0 | 0:0-16:0 |
| 150343 | True | NaN | NaN | NaN | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 150344 | True | None | NaN | True | 4 | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 9:0-20:0 | 9:0-20:0 | 9:0-20:0 | 9:0-20:0 | 9:0-20:0 | 9:0-17:0 | 10:0-17:0 |
| 150345 | True | NaN | NaN | False | 1 | NaN | False | u'free' | NaN | NaN | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 12:0-19:0 | 12:0-19:0 | 12:0-19:0 | 12:0-19:0 | 12:0-19:0 | NaN |
119698 rows × 45 columns
Then, we add the variable attributes_hours_concat to the business_df, resulting in a new df.
# Put the flattened columns in front of the original business columns.
# business_df is concatenated unchanged, so the original nested 'attributes'
# and 'hours' columns are still present in the result.
business_df_preprocessed = pd.concat([attributes_hours_concat, business_df], axis = 1)
business_df_preprocessed
| BusinessAcceptsCreditCards | RestaurantsDelivery | OutdoorSeating | BikeParking | RestaurantsPriceRange2 | RestaurantsTakeOut | ByAppointmentOnly | WiFi | Alcohol | Caters | WheelchairAccessible | GoodForKids | RestaurantsAttire | RestaurantsReservations | Ambience | CoatCheck | DogsAllowed | RestaurantsTableService | RestaurantsGoodForGroups | HasTV | HappyHour | DriveThru | GoodForMeal | NoiseLevel | BusinessAcceptsBitcoin | AcceptsInsurance | Smoking | Music | GoodForDancing | BestNights | BYOB | Corkage | BYOBCorkage | HairSpecializesIn | Open24Hours | RestaurantsCounterService | AgesAllowed | DietaryRestrictions | Monday | Tuesday | Wednesday | Thursday | Friday | Saturday | Sunday | business_id | name | address | city | state | postal_code | latitude | longitude | stars | review_count | is_open | attributes | categories | hours | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0:0-0:0 | 8:0-18:30 | 8:0-18:30 | 8:0-18:30 | 8:0-18:30 | 8:0-14:0 | NaN | mpf3x-BjTdTEA3yCZrAYPw | The UPS Store | 87 Grasso Plaza Shopping Center | Affton | MO | 63123 | 38.551126 | -90.335695 | 3.0 | 15 | 1 | {'BusinessAcceptsCreditCards': 'True'} | Shipping Centers, Local Services, Notaries, Ma... | {'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ... |
| 3 | False | False | False | True | 1 | True | False | u'free' | u'none' | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 7:0-20:0 | 7:0-20:0 | 7:0-20:0 | 7:0-20:0 | 7:0-21:0 | 7:0-21:0 | 7:0-21:0 | MTSW4McQd7CbVtyjqoe9mw | St Honore Pastries | 935 Race St | Philadelphia | PA | 19107 | 39.955505 | -75.155564 | 4.0 | 80 | 1 | {'RestaurantsDelivery': 'False', 'OutdoorSeati... | Restaurants, Food, Bubble Tea, Coffee & Tea, B... | {'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ... |
| 4 | True | NaN | NaN | True | NaN | True | NaN | NaN | NaN | False | True | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 14:0-22:0 | 16:0-22:0 | 12:0-22:0 | 12:0-22:0 | 12:0-18:0 | mWMc6_wTdE0EUBKIGXDVfA | Perkiomen Valley Brewery | 101 Walnut St | Green Lane | PA | 18054 | 40.338183 | -75.471659 | 4.5 | 13 | 1 | {'BusinessAcceptsCreditCards': 'True', 'Wheelc... | Brewpubs, Breweries, Food | {'Wednesday': '14:0-22:0', 'Thursday': '16:0-2... |
| 5 | True | True | True | False | 1 | True | False | u'no' | u'none' | False | True | True | u'casual' | False | None | False | False | False | True | True | False | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0:0-0:0 | 6:0-22:0 | 6:0-22:0 | 6:0-22:0 | 9:0-0:0 | 9:0-22:0 | 8:0-22:0 | CF33F8-E6oudUQ46HnavjQ | Sonic Drive-In | 615 S Main St | Ashland City | TN | 37015 | 36.269593 | -87.058943 | 2.0 | 6 | 1 | {'BusinessParking': 'None', 'BusinessAcceptsCr... | Burgers, Fast Food, Sandwiches, Food, Ice Crea... | {'Monday': '0:0-0:0', 'Tuesday': '6:0-22:0', '... |
| 6 | True | NaN | NaN | True | 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0:0-0:0 | 10:0-18:0 | 10:0-18:0 | 10:0-18:0 | 10:0-18:0 | 10:0-18:0 | 12:0-18:0 | n_0UpQx1hsNbnPUSlodU8w | Famous Footwear | 8522 Eager Road, Dierbergs Brentwood Point | Brentwood | MO | 63144 | 38.627695 | -90.340465 | 2.5 | 13 | 1 | {'BusinessAcceptsCreditCards': 'True', 'Restau... | Sporting Goods, Fashion, Shoe Stores, Shopping... | {'Monday': '0:0-0:0', 'Tuesday': '10:0-18:0', ... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 150341 | NaN | NaN | NaN | NaN | 3 | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 10:0-19:30 | 10:0-19:30 | 10:0-19:30 | 10:0-19:30 | 10:0-19:30 | 10:0-17:30 | 11:0-17:0 | IUQopTMmYQG-qRtBk-8QnA | Binh's Nails | 3388 Gateway Blvd | Edmonton | AB | T6J 5H2 | 53.468419 | -113.492054 | 3.0 | 13 | 1 | {'ByAppointmentOnly': 'False', 'RestaurantsPri... | Nail Salons, Beauty & Spas | {'Monday': '10:0-19:30', 'Tuesday': '10:0-19:3... |
| 150342 | True | NaN | NaN | True | 2 | NaN | NaN | u'no' | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 9:30-17:30 | 9:30-17:30 | 9:30-17:30 | 9:30-17:30 | 9:30-17:30 | 9:0-17:0 | 0:0-16:0 | c8GjPIOTGVmIemT7j5_SyQ | Wild Birds Unlimited | 2813 Bransford Ave | Nashville | TN | 37204 | 36.115118 | -86.766925 | 4.0 | 5 | 1 | {'BusinessAcceptsCreditCards': 'True', 'Restau... | Pets, Nurseries & Gardening, Pet Stores, Hobby... | {'Monday': '9:30-17:30', 'Tuesday': '9:30-17:3... |
| 150343 | True | NaN | NaN | NaN | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | _QAMST-NrQobXduilWEqSw | Claire's Boutique | 6020 E 82nd St, Ste 46 | Indianapolis | IN | 46250 | 39.908707 | -86.065088 | 3.5 | 8 | 1 | {'RestaurantsPriceRange2': '1', 'BusinessAccep... | Shopping, Jewelry, Piercing, Toy Stores, Beaut... | None |
| 150344 | True | None | NaN | True | 4 | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 9:0-20:0 | 9:0-20:0 | 9:0-20:0 | 9:0-20:0 | 9:0-20:0 | 9:0-17:0 | 10:0-17:0 | mtGm22y5c2UHNXDFAjaPNw | Cyclery & Fitness Center | 2472 Troy Rd | Edwardsville | IL | 62025 | 38.782351 | -89.950558 | 4.0 | 24 | 1 | {'BusinessParking': '{'garage': False, 'street... | Fitness/Exercise Equipment, Eyewear & Optician... | {'Monday': '9:0-20:0', 'Tuesday': '9:0-20:0', ... |
| 150345 | True | NaN | NaN | False | 1 | NaN | False | u'free' | NaN | NaN | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 12:0-19:0 | 12:0-19:0 | 12:0-19:0 | 12:0-19:0 | 12:0-19:0 | NaN | jV_XOycEzSlTx-65W906pg | Sic Ink | 238 Apollo Beach Blvd | Apollo beach | FL | 33572 | 27.771002 | -82.394910 | 4.5 | 9 | 1 | {'WheelchairAccessible': 'True', 'BusinessAcce... | Beauty & Spas, Permanent Makeup, Piercing, Tattoo | {'Tuesday': '12:0-19:0', 'Wednesday': '12:0-19... |
119698 rows × 59 columns
We then drop the original attributes and hours columns, since their contents have already been expanded into separate columns.
# The raw `attributes` and `hours` columns are redundant now that their
# contents live in dedicated columns, so remove both in one in-place call.
business_df_preprocessed.drop(columns=['attributes', 'hours'], inplace=True)
business_df_preprocessed
| BusinessAcceptsCreditCards | RestaurantsDelivery | OutdoorSeating | BikeParking | RestaurantsPriceRange2 | RestaurantsTakeOut | ByAppointmentOnly | WiFi | Alcohol | Caters | WheelchairAccessible | GoodForKids | RestaurantsAttire | RestaurantsReservations | Ambience | CoatCheck | DogsAllowed | RestaurantsTableService | RestaurantsGoodForGroups | HasTV | HappyHour | DriveThru | GoodForMeal | NoiseLevel | BusinessAcceptsBitcoin | AcceptsInsurance | Smoking | Music | GoodForDancing | BestNights | BYOB | Corkage | BYOBCorkage | HairSpecializesIn | Open24Hours | RestaurantsCounterService | AgesAllowed | DietaryRestrictions | Monday | Tuesday | Wednesday | Thursday | Friday | Saturday | Sunday | business_id | name | address | city | state | postal_code | latitude | longitude | stars | review_count | is_open | categories | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0:0-0:0 | 8:0-18:30 | 8:0-18:30 | 8:0-18:30 | 8:0-18:30 | 8:0-14:0 | NaN | mpf3x-BjTdTEA3yCZrAYPw | The UPS Store | 87 Grasso Plaza Shopping Center | Affton | MO | 63123 | 38.551126 | -90.335695 | 3.0 | 15 | 1 | Shipping Centers, Local Services, Notaries, Ma... |
| 3 | False | False | False | True | 1 | True | False | u'free' | u'none' | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 7:0-20:0 | 7:0-20:0 | 7:0-20:0 | 7:0-20:0 | 7:0-21:0 | 7:0-21:0 | 7:0-21:0 | MTSW4McQd7CbVtyjqoe9mw | St Honore Pastries | 935 Race St | Philadelphia | PA | 19107 | 39.955505 | -75.155564 | 4.0 | 80 | 1 | Restaurants, Food, Bubble Tea, Coffee & Tea, B... |
| 4 | True | NaN | NaN | True | NaN | True | NaN | NaN | NaN | False | True | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 14:0-22:0 | 16:0-22:0 | 12:0-22:0 | 12:0-22:0 | 12:0-18:0 | mWMc6_wTdE0EUBKIGXDVfA | Perkiomen Valley Brewery | 101 Walnut St | Green Lane | PA | 18054 | 40.338183 | -75.471659 | 4.5 | 13 | 1 | Brewpubs, Breweries, Food |
| 5 | True | True | True | False | 1 | True | False | u'no' | u'none' | False | True | True | u'casual' | False | None | False | False | False | True | True | False | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0:0-0:0 | 6:0-22:0 | 6:0-22:0 | 6:0-22:0 | 9:0-0:0 | 9:0-22:0 | 8:0-22:0 | CF33F8-E6oudUQ46HnavjQ | Sonic Drive-In | 615 S Main St | Ashland City | TN | 37015 | 36.269593 | -87.058943 | 2.0 | 6 | 1 | Burgers, Fast Food, Sandwiches, Food, Ice Crea... |
| 6 | True | NaN | NaN | True | 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0:0-0:0 | 10:0-18:0 | 10:0-18:0 | 10:0-18:0 | 10:0-18:0 | 10:0-18:0 | 12:0-18:0 | n_0UpQx1hsNbnPUSlodU8w | Famous Footwear | 8522 Eager Road, Dierbergs Brentwood Point | Brentwood | MO | 63144 | 38.627695 | -90.340465 | 2.5 | 13 | 1 | Sporting Goods, Fashion, Shoe Stores, Shopping... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 150341 | NaN | NaN | NaN | NaN | 3 | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 10:0-19:30 | 10:0-19:30 | 10:0-19:30 | 10:0-19:30 | 10:0-19:30 | 10:0-17:30 | 11:0-17:0 | IUQopTMmYQG-qRtBk-8QnA | Binh's Nails | 3388 Gateway Blvd | Edmonton | AB | T6J 5H2 | 53.468419 | -113.492054 | 3.0 | 13 | 1 | Nail Salons, Beauty & Spas |
| 150342 | True | NaN | NaN | True | 2 | NaN | NaN | u'no' | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 9:30-17:30 | 9:30-17:30 | 9:30-17:30 | 9:30-17:30 | 9:30-17:30 | 9:0-17:0 | 0:0-16:0 | c8GjPIOTGVmIemT7j5_SyQ | Wild Birds Unlimited | 2813 Bransford Ave | Nashville | TN | 37204 | 36.115118 | -86.766925 | 4.0 | 5 | 1 | Pets, Nurseries & Gardening, Pet Stores, Hobby... |
| 150343 | True | NaN | NaN | NaN | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | _QAMST-NrQobXduilWEqSw | Claire's Boutique | 6020 E 82nd St, Ste 46 | Indianapolis | IN | 46250 | 39.908707 | -86.065088 | 3.5 | 8 | 1 | Shopping, Jewelry, Piercing, Toy Stores, Beaut... |
| 150344 | True | None | NaN | True | 4 | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 9:0-20:0 | 9:0-20:0 | 9:0-20:0 | 9:0-20:0 | 9:0-20:0 | 9:0-17:0 | 10:0-17:0 | mtGm22y5c2UHNXDFAjaPNw | Cyclery & Fitness Center | 2472 Troy Rd | Edwardsville | IL | 62025 | 38.782351 | -89.950558 | 4.0 | 24 | 1 | Fitness/Exercise Equipment, Eyewear & Optician... |
| 150345 | True | NaN | NaN | False | 1 | NaN | False | u'free' | NaN | NaN | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 12:0-19:0 | 12:0-19:0 | 12:0-19:0 | 12:0-19:0 | 12:0-19:0 | NaN | jV_XOycEzSlTx-65W906pg | Sic Ink | 238 Apollo Beach Blvd | Apollo beach | FL | 33572 | 27.771002 | -82.394910 | 4.5 | 9 | 1 | Beauty & Spas, Permanent Makeup, Piercing, Tattoo |
119698 rows × 57 columns
# Split the comma-separated `categories` string into one column per category.
# The raw values are delimited by ", " (comma followed by a space), so we split
# on that exact delimiter -- as the get_businesses_types / get_top_10_businesses
# helpers do. Splitting on "," alone would leave a leading space on every
# category after the first (e.g. " Food" instead of "Food").
categories = business_df_preprocessed['categories'].str.split(', ', expand=True)
categories.head(50)
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Shipping Centers | Local Services | Notaries | Mailbox Centers | Printing Services | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 3 | Restaurants | Food | Bubble Tea | Coffee & Tea | Bakeries | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 4 | Brewpubs | Breweries | Food | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 5 | Burgers | Fast Food | Sandwiches | Food | Ice Cream & Frozen Yogurt | Restaurants | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 6 | Sporting Goods | Fashion | Shoe Stores | Shopping | Sports Wear | Accessories | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 7 | Synagogues | Religious Organizations | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 9 | Ice Cream & Frozen Yogurt | Fast Food | Burgers | Restaurants | Food | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 10 | Department Stores | Shopping | Fashion | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 11 | Vietnamese | Food | Restaurants | Food Trucks | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 12 | American (Traditional) | Restaurants | Diners | Breakfast & Brunch | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 13 | General Dentistry | Dentists | Health & Medical | Cosmetic Dentists | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 15 | Sushi Bars | Restaurants | Japanese | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 16 | Automotive | Auto Parts & Supplies | Auto Customization | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 17 | Vape Shops | Tobacco Shops | Personal Shopping | Vitamins & Supplements | Shopping | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 18 | Automotive | Car Rental | Hotels & Travel | Truck Rental | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 19 | Korean | Restaurants | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 21 | Shopping | Books | Mags | Music & Video | Bookstores | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 22 | Steakhouses | Asian Fusion | Restaurants | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 23 | Restaurants | Italian | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 25 | Pet Services | Pet Groomers | Pets | Veterinarians | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 26 | Women's Clothing | Accessories | Children's Clothing | Men's Clothing | Adult | Shopping | Fashion | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 29 | Pizza | Chicken Wings | Sandwiches | Restaurants | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 30 | Trampoline Parks | Active Life | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 32 | Dance Wear | Sports Wear | Children's Clothing | Arts & Entertainment | Social Clubs | Performing Arts | Sporting Goods | Shoe Stores | Fashion | Shopping | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 33 | Pizza | Restaurants | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 34 | Event Planning & Services | Hotels | Hotels & Travel | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 35 | Eatertainment | Arts & Entertainment | Brewpubs | American (Traditional) | Bakeries | Breweries | Food | Restaurants | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 37 | Mobile Phones | Telecommunications | Electronics | Mobile Phone Accessories | Local Services | Shopping | IT Services & Computer Repair | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 38 | Museums | Kids Activities | Arts & Entertainment | Education | Active Life | Playgrounds | Children's Museums | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 39 | Musicians | DJs | Karaoke | Event Planning & Services | Nightlife | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 40 | Hair Salons | Hair Extensions | Beauty & Spas | Wigs | Shopping | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 41 | Restaurants | Specialty Food | Steakhouses | Food | Italian | Pizza | Pasta Shops | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 42 | DUI Law | Professional Services | Lawyers | Criminal Defense Law | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 43 | Health & Medical | Beauty & Spas | Laser Hair Removal | Doctors | Hair Removal | Chiropractors | Weight Loss Centers | Sports Medicine | Medical Spas | Skin Care | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 46 | Arts & Entertainment | Music Venues | Internet Service Providers | Nightlife | Food | Coffee & Tea | Jazz & Blues | Professional Services | Internet Cafes | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 48 | Shipping Centers | Local Services | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 49 | Restaurants | Italian | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 51 | Real Estate Agents | Home Services | Real Estate | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 52 | Specialty Food | Bakeries | Food | Health Markets | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 54 | Food | Grocery | Convenience Stores | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 55 | Hotels | Hotels & Travel | Event Planning & Services | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 56 | Vitamins & Supplements | Ice Cream & Frozen Yogurt | Food | Juice Bars & Smoothies | Shopping | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 57 | Ophthalmologists | Eyewear & Opticians | Health & Medical | Shopping | Doctors | Optometrists | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 61 | Sports Bars | American (New) | American (Traditional) | Nightlife | Bars | Restaurants | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 63 | Uniforms | Shopping | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 65 | Hotels & Travel | Tours | Local Flavor | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 66 | Local Services | Appliances & Repair | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 67 | Towing | Hotels & Travel | Automotive | Motorcycle Rental | Motorcycle Gear | Shopping | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 68 | Chocolatiers & Shops | Specialty Food | Candy Stores | Food | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 70 | Food Trucks | Food | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
# Number of rows (businesses) in the split-categories frame.
categories.shape[0]
119698
# Count the missing values in each split category column (together with the
# column dtype), sorted by ascending number of missing values, to see from
# which position onwards the columns are mostly empty.
get_nulls(categories)
| index | missing_value | feature_type | |
|---|---|---|---|
| 0 | 0 | 95 | object |
| 1 | 1 | 376 | object |
| 2 | 2 | 25243 | object |
| 3 | 3 | 46457 | object |
| 4 | 4 | 67464 | object |
| 5 | 5 | 86565 | object |
| 6 | 6 | 99841 | object |
| 7 | 7 | 108187 | object |
| 8 | 8 | 113018 | object |
| 9 | 9 | 115987 | object |
| 10 | 10 | 117642 | object |
| 11 | 11 | 118517 | object |
| 12 | 12 | 119042 | object |
| 13 | 13 | 119355 | object |
| 14 | 14 | 119507 | object |
| 15 | 15 | 119579 | object |
| 16 | 16 | 119629 | object |
| 17 | 17 | 119661 | object |
| 18 | 18 | 119680 | object |
| 19 | 19 | 119688 | object |
| 20 | 20 | 119690 | object |
| 21 | 21 | 119692 | object |
| 22 | 22 | 119693 | object |
| 23 | 23 | 119695 | object |
| 24 | 24 | 119695 | object |
| 25 | 25 | 119696 | object |
| 26 | 33 | 119697 | object |
| 27 | 32 | 119697 | object |
| 28 | 31 | 119697 | object |
| 29 | 30 | 119697 | object |
| 30 | 26 | 119697 | object |
| 31 | 28 | 119697 | object |
| 32 | 27 | 119697 | object |
| 33 | 34 | 119697 | object |
| 34 | 29 | 119697 | object |
| 35 | 35 | 119697 | object |
As we can see, splitting the categories variable produces 36 sub-variables (columns 0 to 35). Since keeping all 36 would be counterproductive, we look at the position from which the number of nulls grows sharply. We then keep only the most relevant columns, so that we do not include variables that are almost entirely null.
# Keep only the first eight split columns (labels 0-7); according to the null
# counts above, the columns after these are missing for the vast majority of rows.
categories = categories.iloc[:, :8]
categories
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | |
|---|---|---|---|---|---|---|---|---|
| 1 | Shipping Centers | Local Services | Notaries | Mailbox Centers | Printing Services | None | None | None |
| 3 | Restaurants | Food | Bubble Tea | Coffee & Tea | Bakeries | None | None | None |
| 4 | Brewpubs | Breweries | Food | None | None | None | None | None |
| 5 | Burgers | Fast Food | Sandwiches | Food | Ice Cream & Frozen Yogurt | Restaurants | None | None |
| 6 | Sporting Goods | Fashion | Shoe Stores | Shopping | Sports Wear | Accessories | None | None |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 150341 | Nail Salons | Beauty & Spas | None | None | None | None | None | None |
| 150342 | Pets | Nurseries & Gardening | Pet Stores | Hobby Shops | Bird Shops | Home & Garden | Shopping | None |
| 150343 | Shopping | Jewelry | Piercing | Toy Stores | Beauty & Spas | Accessories | Fashion | None |
| 150344 | Fitness/Exercise Equipment | Eyewear & Opticians | Shopping | Sporting Goods | Bikes | None | None | None |
| 150345 | Beauty & Spas | Permanent Makeup | Piercing | Tattoo | None | None | None | None |
119698 rows × 8 columns
Now, for convenience, let's rename the columns. Since there is no clear meaning attached to each category position, we give them generic names.
# Rename the eight positional columns to "Category 1" ... "Category 8".
# NOTE(review): the return value is discarded, so the helper (defined elsewhere)
# presumably renames the columns in place — confirm against its definition.
get_categories_names(categories)
categories
| Category 1 | Category 2 | Category 3 | Category 4 | Category 5 | Category 6 | Category 7 | Category 8 | |
|---|---|---|---|---|---|---|---|---|
| 1 | Shipping Centers | Local Services | Notaries | Mailbox Centers | Printing Services | None | None | None |
| 3 | Restaurants | Food | Bubble Tea | Coffee & Tea | Bakeries | None | None | None |
| 4 | Brewpubs | Breweries | Food | None | None | None | None | None |
| 5 | Burgers | Fast Food | Sandwiches | Food | Ice Cream & Frozen Yogurt | Restaurants | None | None |
| 6 | Sporting Goods | Fashion | Shoe Stores | Shopping | Sports Wear | Accessories | None | None |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 150341 | Nail Salons | Beauty & Spas | None | None | None | None | None | None |
| 150342 | Pets | Nurseries & Gardening | Pet Stores | Hobby Shops | Bird Shops | Home & Garden | Shopping | None |
| 150343 | Shopping | Jewelry | Piercing | Toy Stores | Beauty & Spas | Accessories | Fashion | None |
| 150344 | Fitness/Exercise Equipment | Eyewear & Opticians | Shopping | Sporting Goods | Bikes | None | None | None |
| 150345 | Beauty & Spas | Permanent Makeup | Piercing | Tattoo | None | None | None | None |
119698 rows × 8 columns
# Put the category columns in front of the existing business columns.
# Column-wise concat (axis=1) aligns the two frames on their row index.
business_df_preprocessed = pd.concat([categories, business_df_preprocessed], axis = 1)
business_df_preprocessed
| Category 1 | Category 2 | Category 3 | Category 4 | Category 5 | Category 6 | Category 7 | Category 8 | BusinessAcceptsCreditCards | RestaurantsDelivery | OutdoorSeating | BikeParking | RestaurantsPriceRange2 | RestaurantsTakeOut | ByAppointmentOnly | WiFi | Alcohol | Caters | WheelchairAccessible | GoodForKids | RestaurantsAttire | RestaurantsReservations | Ambience | CoatCheck | DogsAllowed | RestaurantsTableService | RestaurantsGoodForGroups | HasTV | HappyHour | DriveThru | GoodForMeal | NoiseLevel | BusinessAcceptsBitcoin | AcceptsInsurance | Smoking | Music | GoodForDancing | BestNights | BYOB | Corkage | BYOBCorkage | HairSpecializesIn | Open24Hours | RestaurantsCounterService | AgesAllowed | DietaryRestrictions | Monday | Tuesday | Wednesday | Thursday | Friday | Saturday | Sunday | business_id | name | address | city | state | postal_code | latitude | longitude | stars | review_count | is_open | categories | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Shipping Centers | Local Services | Notaries | Mailbox Centers | Printing Services | None | None | None | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0:0-0:0 | 8:0-18:30 | 8:0-18:30 | 8:0-18:30 | 8:0-18:30 | 8:0-14:0 | NaN | mpf3x-BjTdTEA3yCZrAYPw | The UPS Store | 87 Grasso Plaza Shopping Center | Affton | MO | 63123 | 38.551126 | -90.335695 | 3.0 | 15 | 1 | Shipping Centers, Local Services, Notaries, Ma... |
| 3 | Restaurants | Food | Bubble Tea | Coffee & Tea | Bakeries | None | None | None | False | False | False | True | 1 | True | False | u'free' | u'none' | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 7:0-20:0 | 7:0-20:0 | 7:0-20:0 | 7:0-20:0 | 7:0-21:0 | 7:0-21:0 | 7:0-21:0 | MTSW4McQd7CbVtyjqoe9mw | St Honore Pastries | 935 Race St | Philadelphia | PA | 19107 | 39.955505 | -75.155564 | 4.0 | 80 | 1 | Restaurants, Food, Bubble Tea, Coffee & Tea, B... |
| 4 | Brewpubs | Breweries | Food | None | None | None | None | None | True | NaN | NaN | True | NaN | True | NaN | NaN | NaN | False | True | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 14:0-22:0 | 16:0-22:0 | 12:0-22:0 | 12:0-22:0 | 12:0-18:0 | mWMc6_wTdE0EUBKIGXDVfA | Perkiomen Valley Brewery | 101 Walnut St | Green Lane | PA | 18054 | 40.338183 | -75.471659 | 4.5 | 13 | 1 | Brewpubs, Breweries, Food |
| 5 | Burgers | Fast Food | Sandwiches | Food | Ice Cream & Frozen Yogurt | Restaurants | None | None | True | True | True | False | 1 | True | False | u'no' | u'none' | False | True | True | u'casual' | False | None | False | False | False | True | True | False | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0:0-0:0 | 6:0-22:0 | 6:0-22:0 | 6:0-22:0 | 9:0-0:0 | 9:0-22:0 | 8:0-22:0 | CF33F8-E6oudUQ46HnavjQ | Sonic Drive-In | 615 S Main St | Ashland City | TN | 37015 | 36.269593 | -87.058943 | 2.0 | 6 | 1 | Burgers, Fast Food, Sandwiches, Food, Ice Crea... |
| 6 | Sporting Goods | Fashion | Shoe Stores | Shopping | Sports Wear | Accessories | None | None | True | NaN | NaN | True | 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0:0-0:0 | 10:0-18:0 | 10:0-18:0 | 10:0-18:0 | 10:0-18:0 | 10:0-18:0 | 12:0-18:0 | n_0UpQx1hsNbnPUSlodU8w | Famous Footwear | 8522 Eager Road, Dierbergs Brentwood Point | Brentwood | MO | 63144 | 38.627695 | -90.340465 | 2.5 | 13 | 1 | Sporting Goods, Fashion, Shoe Stores, Shopping... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 150341 | Nail Salons | Beauty & Spas | None | None | None | None | None | None | NaN | NaN | NaN | NaN | 3 | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 10:0-19:30 | 10:0-19:30 | 10:0-19:30 | 10:0-19:30 | 10:0-19:30 | 10:0-17:30 | 11:0-17:0 | IUQopTMmYQG-qRtBk-8QnA | Binh's Nails | 3388 Gateway Blvd | Edmonton | AB | T6J 5H2 | 53.468419 | -113.492054 | 3.0 | 13 | 1 | Nail Salons, Beauty & Spas |
| 150342 | Pets | Nurseries & Gardening | Pet Stores | Hobby Shops | Bird Shops | Home & Garden | Shopping | None | True | NaN | NaN | True | 2 | NaN | NaN | u'no' | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 9:30-17:30 | 9:30-17:30 | 9:30-17:30 | 9:30-17:30 | 9:30-17:30 | 9:0-17:0 | 0:0-16:0 | c8GjPIOTGVmIemT7j5_SyQ | Wild Birds Unlimited | 2813 Bransford Ave | Nashville | TN | 37204 | 36.115118 | -86.766925 | 4.0 | 5 | 1 | Pets, Nurseries & Gardening, Pet Stores, Hobby... |
| 150343 | Shopping | Jewelry | Piercing | Toy Stores | Beauty & Spas | Accessories | Fashion | None | True | NaN | NaN | NaN | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | _QAMST-NrQobXduilWEqSw | Claire's Boutique | 6020 E 82nd St, Ste 46 | Indianapolis | IN | 46250 | 39.908707 | -86.065088 | 3.5 | 8 | 1 | Shopping, Jewelry, Piercing, Toy Stores, Beaut... |
| 150344 | Fitness/Exercise Equipment | Eyewear & Opticians | Shopping | Sporting Goods | Bikes | None | None | None | True | None | NaN | True | 4 | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 9:0-20:0 | 9:0-20:0 | 9:0-20:0 | 9:0-20:0 | 9:0-20:0 | 9:0-17:0 | 10:0-17:0 | mtGm22y5c2UHNXDFAjaPNw | Cyclery & Fitness Center | 2472 Troy Rd | Edwardsville | IL | 62025 | 38.782351 | -89.950558 | 4.0 | 24 | 1 | Fitness/Exercise Equipment, Eyewear & Optician... |
| 150345 | Beauty & Spas | Permanent Makeup | Piercing | Tattoo | None | None | None | None | True | NaN | NaN | False | 1 | NaN | False | u'free' | NaN | NaN | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 12:0-19:0 | 12:0-19:0 | 12:0-19:0 | 12:0-19:0 | 12:0-19:0 | NaN | jV_XOycEzSlTx-65W906pg | Sic Ink | 238 Apollo Beach Blvd | Apollo beach | FL | 33572 | 27.771002 | -82.394910 | 4.5 | 9 | 1 | Beauty & Spas, Permanent Makeup, Piercing, Tattoo |
119698 rows × 65 columns
Once the dataset has been adjusted, the next step is to select the type of business that we want to deal with, according to the use case and the problem that we have identified.
For this, we have decided, as previously mentioned, to focus on the Nightlife sector.
# Count how many businesses list each category: the helper splits the
# comma-separated `categories` string, explodes it to one row per category,
# and returns the most frequent categories with their counts.
get_businesses_types(business_df_preprocessed)
| Categories | Count | |
|---|---|---|
| 0 | Restaurants | 34987 |
| 1 | Food | 20419 |
| 2 | Shopping | 20186 |
| 3 | Home Services | 13322 |
| 4 | Beauty & Spas | 12263 |
| 5 | Health & Medical | 11046 |
| 6 | Local Services | 10138 |
| 7 | Automotive | 9879 |
| 8 | Nightlife | 8379 |
| 9 | Event Planning & Services | 8173 |
As we can see, this sector accounts for a large share of the businesses, ranking 9th by number of businesses. Taking into account the size of the reviews dataset, we consider it a good option, since we will have enough reviews to fit and run the model without demanding too much computing power.
# Number of businesses (rows) before filtering by sector.
len(business_df_preprocessed)
119698
# Build the nightlife-only subset of the business table.
# NOTE(review): the helper is defined elsewhere and its name is spelled
# `get_nighlife_businesses` (missing "t"); if it is ever renamed, update this call too.
nightlife_businesses = get_nighlife_businesses(business_df_preprocessed)
We used the previous function to create a new dataframe containing only the businesses related to the nightlife sector. It creates a new column called categories. This new column contains the respective category in separate rows, so that each row represents a single category for a business.
Hence, we can easily filter by 'nightlife' to obtain the desired dataframe.
# The `categories` column was only needed for filtering; remove it in place
# (mutates `nightlife_businesses` rather than creating a new frame) and display the result.
nightlife_businesses.drop('categories', axis = 1, inplace = True)
nightlife_businesses
| Category 1 | Category 2 | Category 3 | Category 4 | Category 5 | Category 6 | Category 7 | Category 8 | BusinessAcceptsCreditCards | RestaurantsDelivery | OutdoorSeating | BikeParking | RestaurantsPriceRange2 | RestaurantsTakeOut | ByAppointmentOnly | WiFi | Alcohol | Caters | WheelchairAccessible | GoodForKids | RestaurantsAttire | RestaurantsReservations | Ambience | CoatCheck | DogsAllowed | RestaurantsTableService | RestaurantsGoodForGroups | HasTV | HappyHour | DriveThru | GoodForMeal | NoiseLevel | BusinessAcceptsBitcoin | AcceptsInsurance | Smoking | Music | GoodForDancing | BestNights | BYOB | Corkage | BYOBCorkage | HairSpecializesIn | Open24Hours | RestaurantsCounterService | AgesAllowed | DietaryRestrictions | Monday | Tuesday | Wednesday | Thursday | Friday | Saturday | Sunday | business_id | name | address | city | state | postal_code | latitude | longitude | stars | review_count | is_open | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 39 | Musicians | DJs | Karaoke | Event Planning & Services | Nightlife | None | None | None | True | NaN | NaN | NaN | NaN | NaN | NaN | u'no' | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 9:0-17:0 | 9:0-17:0 | 9:0-17:0 | 9:0-17:0 | 9:0-17:0 | 9:0-17:0 | 9:0-17:0 | fSCNwMtNNQY9QT69Cj9fiA | Sierra Pro Events | Sparks | NV | 89431 | 39.540154 | -119.748395 | 5.0 | 7 | 1 | |
| 46 | Arts & Entertainment | Music Venues | Internet Service Providers | Nightlife | Food | Coffee & Tea | Jazz & Blues | Professional Services | NaN | NaN | False | True | 2 | True | NaN | u'free' | u'none' | False | True | NaN | NaN | NaN | NaN | False | False | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | u'no' | {'dj': False, 'background_music': False, 'no_m... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 11:0-1:0 | 11:0-1:0 | 11:0-1:0 | 11:0-1:0 | 11:0-1:0 | 19:0-1:0 | 19:0-1:0 | JX4tUpd09YFchLBuI43lGw | Naked Cyber Cafe & Espresso Bar | 10303 108 Street NW | Edmonton | AB | T5J 1L7 | 53.544682 | -113.506589 | 4.0 | 12 | 1 |
| 61 | Sports Bars | American (New) | American (Traditional) | Nightlife | Bars | Restaurants | None | None | True | False | True | False | 2 | True | NaN | 'free' | 'full_bar' | False | NaN | True | 'casual' | False | {u'divey': False, u'hipster': None, u'casual':... | False | False | True | True | True | True | NaN | {'dessert': None, 'latenight': None, 'lunch': ... | 'average' | NaN | NaN | u'no' | {'dj': False} | False | {u'monday': False, u'tuesday': True, u'wednesd... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0:0-0:0 | 11:0-0:0 | 11:0-0:0 | 11:0-22:0 | 11:0-19:0 | 11:0-1:0 | 11:0-0:0 | seKihQKpGGnCeLuELRQPSQ | Twin Peaks | 6880 E 82nd St | Indianapolis | IN | 46250 | 39.906295 | -86.047463 | 3.5 | 257 | 1 |
| 73 | American (Traditional) | Bars | Nightlife | Sports Bars | Restaurants | None | None | None | True | False | False | True | 1 | True | NaN | 'free' | u'full_bar' | False | NaN | False | u'casual' | False | {'romantic': False, 'intimate': False, 'classy... | NaN | NaN | NaN | True | True | NaN | NaN | NaN | u'average' | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 8rb-3VYXE37IZix4yOdskw | Sharky's Sports Bar & Grill | 820 N Black Horse Pike | Williamstown | NJ | 08094 | 39.696801 | -74.999821 | 2.5 | 29 | 1 |
| 75 | American (Traditional) | Sports Bars | Restaurants | Bars | Nightlife | Steakhouses | Salad | Beer Bar | True | True | None | True | NaN | True | NaN | u'free' | u'full_bar' | True | True | NaN | NaN | False | {'touristy': False, 'hipster': False, 'romanti... | False | NaN | True | NaN | True | True | NaN | {'dessert': False, 'latenight': False, 'lunch'... | NaN | NaN | NaN | u'no' | {'dj': False, 'background_music': False, 'no_m... | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0:0-0:0 | 11:0-0:0 | 11:0-1:0 | 11:0-2:0 | 11:0-2:0 | 11:0-2:0 | 11:0-23:0 | aCDY7vXYMs54EbYuQScsnQ | 39 North Taproom & Grill | 110 S MacDade Blvd | Glenolden | PA | 19036 | 39.903697 | -75.294981 | 4.5 | 25 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 150269 | Nightlife | Cafes | Hotels | Bars | Hotels & Travel | Restaurants | Event Planning & Services | None | NaN | False | NaN | NaN | 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | False | {'touristy': False, 'hipster': False, 'romanti... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 11:0-0:0 | 11:0-0:0 | 11:0-0:0 | 11:0-0:0 | 11:0-2:0 | 11:0-2:0 | NaN | 2dVJ7R-3JMmu2v4DJYtBbw | Spring Mount Hotel | 3 Main St | Schwenksville | PA | 19473 | 40.275532 | -75.456772 | 2.0 | 5 | 1 |
| 150274 | Sports Bars | Nightlife | Pubs | Bars | None | None | None | None | True | NaN | True | True | 1 | NaN | NaN | 'free' | u'full_bar' | NaN | NaN | NaN | NaN | False | {'touristy': False, 'hipster': None, 'romantic... | False | NaN | NaN | True | True | True | NaN | NaN | u'loud' | NaN | NaN | u'outdoor' | {'dj': None, 'background_music': False, 'no_mu... | False | {'monday': False, 'tuesday': False, 'friday': ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 11:0-0:0 | 11:0-0:0 | 11:0-0:0 | 11:0-0:0 | 11:0-1:0 | 11:0-1:0 | 11:0-0:0 | 9iCum5W48TqNVDTZCJUKjg | Plantation Pub | 8321 Sawyer Brown Rd | Bellevue | TN | 37221 | 36.070748 | -86.947891 | 3.5 | 40 | 1 |
| 150275 | Bars | Tapas Bars | Restaurants | Nightlife | Gastropubs | Lounges | Cocktail Bars | None | True | False | True | True | 2 | False | NaN | 'no' | 'full_bar' | False | True | False | 'casual' | False | {'touristy': False, 'hipster': True, 'romantic... | False | NaN | True | True | True | True | NaN | {'dessert': False, 'latenight': False, 'lunch'... | u'average' | NaN | NaN | u'outdoor' | {'dj': False, 'background_music': False, 'no_m... | False | {'monday': True, 'tuesday': False, 'friday': T... | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 17:0-2:0 | 17:0-2:0 | 17:0-2:0 | 14:0-2:0 | 11:0-2:0 | 11:0-0:0 | IeSD0nMKRFYUTnR5nZH1CQ | HighWire Lounge | 14 S Arizona Ave | Tucson | AZ | 85701 | 32.221828 | -110.967969 | 3.5 | 111 | 1 |
| 150292 | Bars | Beer Bar | Nightlife | Wine Bars | Pizza | Restaurants | None | None | NaN | NaN | True | NaN | 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | u'casual' | NaN | {u'divey': False, u'hipster': None, u'casual':... | False | NaN | True | True | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | {u'dj': None, u'live': False, u'jukebox': None... | NaN | {u'monday': False, u'tuesday': False, u'wednes... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 11:30-23:0 | 11:30-23:0 | 11:30-23:0 | 11:30-23:0 | 11:30-0:30 | 11:30-0:30 | 11:30-23:0 | esBGrrmuZzSiECyRBoKvvA | Colony Grill - St. Petersburg | 670 Central Ave | St. Petersburg | FL | 33701 | 27.770872 | -82.643069 | 4.5 | 38 | 1 |
| 150323 | Bars | Gastropubs | Sandwiches | Nightlife | Restaurants | None | None | None | True | None | None | True | 2 | None | NaN | u'free' | u'full_bar' | False | NaN | False | 'casual' | False | {'touristy': False, 'hipster': None, 'romantic... | NaN | False | NaN | True | True | NaN | True | {'dessert': None, 'latenight': True, 'lunch': ... | u'average' | NaN | NaN | NaN | {'dj': False, 'background_music': False, 'no_m... | False | {'monday': False, 'tuesday': False, 'friday': ... | NaN | NaN | 'no' | NaN | NaN | NaN | NaN | NaN | 0:0-0:0 | 11:0-22:0 | 11:0-22:0 | 11:0-23:0 | 11:0-23:30 | 11:0-23:0 | 11:0-22:0 | w_4xUt-1AyY2ZwKtnjW0Xg | Bittercreek Alehouse | 246 N 8th St | Boise | ID | 83702 | 43.616590 | -116.202383 | 4.5 | 998 | 1 |
8379 rows × 64 columns
# Number of rows in the nightlife subset, to compare with the category count obtained earlier.
len(nightlife_businesses)
8379
The newly generated dataframe contains exactly the number of businesses identified in the previous analysis as nightlife businesses.
# Plot the nightlife subset with the project's visualization helper.
# NOTE(review): the helper is defined elsewhere; presumably it draws the businesses
# by location/state — confirm against its definition.
get_nightlife_business_visualization(nightlife_businesses)
As we can see, not all the states included in the yelp dataset have nightlife businesses. The objective is to transform it into a more workable dataframe taking into account the use case and computing power.
# Tally nightlife businesses per state; `most_common()` yields (state, count)
# pairs ordered from the most to the least frequent state.
pd.DataFrame(Counter(nightlife_businesses.state.values).most_common(), columns=['state','Nightlife businesses'])
| state | Nightlife businesses | |
|---|---|---|
| 0 | PA | 1716 |
| 1 | FL | 1490 |
| 2 | LA | 834 |
| 3 | TN | 816 |
| 4 | MO | 745 |
| 5 | IN | 638 |
| 6 | AZ | 383 |
| 7 | NV | 378 |
| 8 | AB | 326 |
| 9 | NJ | 323 |
| 10 | ID | 258 |
| 11 | CA | 198 |
| 12 | IL | 147 |
| 13 | DE | 127 |
# Restrict the nightlife subset to a single state and preview the first rows.
# NOTE(review): the state is not passed as an argument, so the helper (defined
# elsewhere) presumably hard-codes Pennsylvania ("PA") — confirm against its definition.
nightlife_businesses_PA = get_state_businesses(nightlife_businesses)
nightlife_businesses_PA.head()
| Category 1 | Category 2 | Category 3 | Category 4 | Category 5 | Category 6 | Category 7 | Category 8 | BusinessAcceptsCreditCards | RestaurantsDelivery | OutdoorSeating | BikeParking | RestaurantsPriceRange2 | RestaurantsTakeOut | ByAppointmentOnly | WiFi | Alcohol | Caters | WheelchairAccessible | GoodForKids | RestaurantsAttire | RestaurantsReservations | Ambience | CoatCheck | DogsAllowed | RestaurantsTableService | RestaurantsGoodForGroups | HasTV | HappyHour | DriveThru | GoodForMeal | NoiseLevel | BusinessAcceptsBitcoin | AcceptsInsurance | Smoking | Music | GoodForDancing | BestNights | BYOB | Corkage | BYOBCorkage | HairSpecializesIn | Open24Hours | RestaurantsCounterService | AgesAllowed | DietaryRestrictions | Monday | Tuesday | Wednesday | Thursday | Friday | Saturday | Sunday | business_id | name | address | city | state | postal_code | latitude | longitude | stars | review_count | is_open | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 75 | American (Traditional) | Sports Bars | Restaurants | Bars | Nightlife | Steakhouses | Salad | Beer Bar | True | True | None | True | NaN | True | NaN | u'free' | u'full_bar' | True | True | NaN | NaN | False | {'touristy': False, 'hipster': False, 'romanti... | False | NaN | True | NaN | True | True | NaN | {'dessert': False, 'latenight': False, 'lunch'... | NaN | NaN | NaN | u'no' | {'dj': False, 'background_music': False, 'no_m... | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0:0-0:0 | 11:0-0:0 | 11:0-1:0 | 11:0-2:0 | 11:0-2:0 | 11:0-2:0 | 11:0-23:0 | aCDY7vXYMs54EbYuQScsnQ | 39 North Taproom & Grill | 110 S MacDade Blvd | Glenolden | PA | 19036 | 39.903697 | -75.294981 | 4.5 | 25 | 1 |
| 292 | Italian | Restaurants | Salad | Bars | Venues & Event Spaces | Wine Bars | Nightlife | Event Planning & Services | NaN | NaN | True | NaN | 2 | NaN | NaN | u'free' | u'full_bar' | NaN | NaN | NaN | u'casual' | True | {u'divey': False, u'hipster': False, u'casual'... | NaN | NaN | True | True | NaN | True | NaN | NaN | u'average' | NaN | NaN | NaN | NaN | NaN | {u'monday': False, u'tuesday': False, u'wednes... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 11:30-21:0 | 11:30-21:0 | 11:30-21:0 | 11:30-22:0 | 11:30-22:0 | 11:30-21:0 | rZw9O5lJ36m_mXeRKE4G9A | La Sponda | 20 E Lancaster Ave | Downingtown | PA | 19335 | 40.006208 | -75.704451 | 4.5 | 29 | 1 |
| 340 | Nightlife | Pubs | American (Traditional) | Restaurants | Bars | None | None | None | True | True | True | True | 2 | True | NaN | u'free' | u'full_bar' | False | True | False | 'casual' | False | {'touristy': False, 'hipster': None, 'romantic... | False | True | True | True | True | True | NaN | {'dessert': False, 'latenight': True, 'lunch':... | u'average' | NaN | NaN | u'outdoor' | {'dj': False, 'background_music': False, 'no_m... | False | {'monday': False, 'tuesday': False, 'friday': ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0:0-0:0 | 11:0-22:0 | 11:0-22:0 | 11:0-22:0 | 11:0-23:30 | 11:0-23:30 | 11:0-20:0 | xJyp6RLqNRv3tSu6njPKxQ | World of Beer | 102 Main St, Ste 100 | Exton | PA | 19341 | 40.023665 | -75.629188 | 3.0 | 208 | 1 |
| 420 | Bars | Nightlife | Pubs | None | None | None | None | None | True | True | True | True | 2 | True | NaN | u'free' | u'full_bar' | NaN | NaN | NaN | NaN | False | {'romantic': False, 'intimate': False, 'touris... | NaN | NaN | NaN | True | True | NaN | NaN | NaN | u'loud' | NaN | NaN | NaN | {'dj': False, 'background_music': False, 'no_m... | False | {'monday': False, 'tuesday': False, 'friday': ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 11:0-2:0 | 11:0-2:0 | 11:0-2:0 | 11:0-2:0 | 11:0-2:0 | 11:0-2:0 | 11:0-2:0 | eJ77e9lGxY3ArzaoDbHhYw | Paddy Whacks Irish Sports Pub - South Street | 150 South St | Philadelphia | PA | 19147 | 39.941054 | -75.145463 | 2.5 | 161 | 1 |
| 443 | Event Planning & Services | Whiskey Bars | American (New) | American (Traditional) | Venues & Event Spaces | Bars | Nightlife | Restaurants | True | False | True | False | 2 | True | False | u'free' | u'full_bar' | True | True | False | u'casual' | True | {'touristy': False, 'hipster': False, 'romanti... | NaN | False | True | True | True | True | NaN | {'dessert': None, 'latenight': True, 'lunch': ... | u'average' | False | NaN | u'no' | {'dj': False, 'background_music': False, 'no_m... | False | {'monday': False, 'tuesday': False, 'friday': ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0:0-0:0 | 16:0-22:0 | 16:0-22:0 | 12:0-22:0 | 16:0-0:0 | 11:0-0:0 | 10:0-2:0 | 7mpYTDb24SywNMRn3yeakQ | The Twisted Tail | 509 S 2nd St | Philadelphia | PA | 19147 | 39.941877 | -75.145199 | 4.0 | 604 | 1 |
# Number of rows in the single-state subset.
len(nightlife_businesses_PA)
1716
# Summarise the subset: total number of reviews (sum of `review_count`) and
# average star rating (mean of `stars`). The result is a two-entry Series,
# so `head(10)` simply shows all of it.
review_count = nightlife_businesses_PA[['review_count', 'stars']].agg({'review_count': 'sum', 'stars': 'mean'})
review_count.head(10)
review_count 252555.000000 stars 3.621212 dtype: float64
We have a total of 252555 reviews for the selected nightlife businesses (a good number of reviews to work with without relying too much on computing power).
# Report the dimensions of the single-state nightlife subset (rows = businesses,
# columns = variables). A single f-string is used because passing several
# arguments to print() inserts its own separator space on top of the spaces
# already present in the literals, which produced doubled spaces in the message.
print(f"In total we have {nightlife_businesses_PA.shape[0]} businesses and {nightlife_businesses_PA.shape[1]} variables")
In total we have 1716 businesses and 64 variables
# Interactive map of the selected nightlife businesses: one marker per business,
# with both marker size and marker colour driven by the star rating.
fig = px.scatter_mapbox(
    nightlife_businesses_PA,
    # marker position
    lat="latitude",
    lon="longitude",
    # marker appearance
    color='stars',
    color_continuous_scale=px.colors.sequential.Rainbow,
    size='stars',
    size_max=9,
    # tooltip contents
    hover_name='name',
    hover_data=['stars','review_count'],
    # initial view and caption
    zoom=8,
    title='Nightlife Businesses Visualization',
)
# Use the OpenStreetMap base layer for the map tiles.
fig.update_layout(mapbox_style='open-street-map')
fig.show()
# Persist the single-state nightlife subset as CSV at the path configured elsewhere
# in the notebook. With the default settings the row index is written as the first column.
nightlife_businesses_PA.to_csv(nightlife_business_csv_outpath)